#### PREAMBLE ####

library(ggplot2)
library(geosphere)
library(dplyr)
library(data.table)
library(tidyr)
library(zoo)
library(sf)
library(ggspatial)
library(RColorBrewer)
library(estimatr)
library(broom)
library(texreg)
library(did)


#### LOAD DATA ####

set.seed(20230711)

#load main data
df <- read.csv("Outputs/Analysis all muns.csv", stringsAsFactors = F, na.strings = c("", "NA"))
df$INEGI <- sprintf("%05d", df$INEGI)
df$INEGI <- as.character(df$INEGI)
df$time <- as.Date(df$time)

#### TRANSFORMATION ####

#states treated by US trade restrictions
unique(df$State[df$ustrade==1])
df$ustrade[df$time>=as.Date("2016-06-01") & df$trade==1 & (df$State=="MICHOACAN")] <- 1
#updating ustrade.
df$ustrade <- df$ustrade * df$trade

#### LOAD AND JOIN PUBLIC EXPENDITURE DATA ####

# Load necessary library
library(data.table) # Using data.table for faster file reading

# Define the path to the folder containing the CSV files
folder_path <- "Source data/Public expenditure/conjunto_de_datos"

# Define a pattern to match files ending with the years 2011 to 2019
year_pattern <- paste0("(", paste(2011:2019, collapse = "|"), ")\\.csv$")

# Get a list of all CSV files in the folder that match the year pattern
csv_files <- list.files(path = folder_path, pattern = year_pattern, full.names = TRUE)

# Load all matching CSV files into a list
csv_list <- lapply(csv_files, fread) # fread is used from data.table package for faster reading

combined_data <- rbindlist(csv_list)
gc()

spend <- combined_data
rm(combined_data, csv_list)
gc()

spend$ID_ENTIDAD <- sprintf("%02d", spend$ID_ENTIDAD)
spend$ID_MUNICIPIO <- sprintf("%03d", spend$ID_MUNICIPIO)
spend$INEGI <- paste0(spend$ID_ENTIDAD, spend$ID_MUNICIPIO)
spend$category <- paste0(spend$TEMA, spend$CATEGORIA, spend$DESCRIPCION_CATEGORIA)
spend$category <- gsub(" ", "", spend$category)
gc()

spend <- spend[,c("INEGI", "ANIO", "VALOR", "category")]
gc()

spend <- spend[spend$INEGI %in% df$INEGI,]

check <- spend %>% 
  dplyr::group_by(INEGI, ANIO, category) %>%
  dplyr::summarise(n = dplyr::n(), .groups = "drop") %>%
  dplyr::filter(n > 1L) 

spend <- spend[!spend$category %in% check$category,]

rm(check)
gc()

wide_format <- spend %>%
  pivot_wider(values_from="VALOR", names_from = "category")

rm(spend)
gc()

#summarize up to municipal level
variables <- colnames(wide_format)[3:52]
average_data <- wide_format %>%
  group_by(INEGI) %>%
  summarize_at(variables, mean, na.rm = TRUE)

spend <- average_data

rm(average_data, wide_format)
gc()


#### SUMMARIZE AND MERGE DF ####

#summarize up to municipal level
variables <- c("long", "lat", "trade", "min_dist", "malicious_homicides")
df[variables] <- lapply(df[variables], function(x) {
  if(!is.numeric(x)) as.numeric(gsub(",", "", x)) else x
})
average_data <- df %>%
  group_by(INEGI) %>%
  summarize_at(variables, mean, na.rm = TRUE)

for (col in colnames(average_data)) {
  average_data[[col]][is.nan(average_data[[col]])] <- NA_real_
}

average_data$trade[average_data$trade>0] <- 1

#combine datasets
df <- left_join(average_data, spend, by=c("INEGI"="INEGI"))

objects <- ls()
rm(list = objects[objects != "df"])
rm(objects)
gc()

#### GENETIC MATCHING ####

library(Matching)
library(optmatch)   
library(cobalt)   

#pare down to remove nas and variables with too much missing
df <- df[!df$min_dist=="Inf",]
na_counts <- colSums(is.na(df))

df <- df[, colSums(is.na(df)) <= 200] #subset to variables with less than 200 nas

na_counts <- colSums(is.na(df[df$trade==1,])) #check nas where trade ==1

df <- df[!(df$trade == 0 & rowSums(is.na(df)) > 0), ] #remove incomplete rows where trade is zero

df <- df[, colSums(is.na(df)) <= 1] #subset to variables with 1 or fewer nas

na_counts <- colSums(is.na(df))

#subset to complete rows
df <- df[!(rowSums(is.na(df)) > 0), ]
na_counts <- colSums(is.na(df))
sum(na_counts)==0

#set all to numeric
df[-which(names(df) == "INEGI")] <- lapply(df[-which(names(df) == "INEGI")], as.numeric)

treatment <- df$trade
covariates <- df[, !colnames(df) %in% c("INEGI", "trade", "malicious_homicides")]
outcome <- df$malicious_homicides

# Define a balance matrix
X <- as.matrix(covariates)

# Perform genetic matching
library(rgenoud)
genetic_match <- GenMatch(Tr = treatment, X = X, M = 1, pop.size = 100, max.generations = 10, wait.generations = 1)

# Use the matching result to perform matching
matched <- Match(Y = outcome, Tr = treatment, X = X, Weight.matrix = genetic_match)


### ASSESS ###

# Summary of the matching result
summary(matched)

# Balance diagnostics using cobalt
bal.tab(matched, treat = treatment, covs = covariates, method = "genetic")


#### SUBSET FOR REGRESSION ####

treated_indices <- matched$index.treated
control_indices <- matched$index.control

# Create a mapping of treated to control units
matched_pairs <- data.frame(
  treated = treated_indices,
  control = control_indices
)

# Optionally, add the INEGI identifiers to the mapping
matched_pairs$treated_INEGI <- df$INEGI[treated_indices]
matched_pairs$control_INEGI <- df$INEGI[control_indices]

# Save the mapping to a CSV file
write.csv(matched_pairs, "Outputs/genetic_matched_pairs.csv", row.names = FALSE)

# Subset the original dataset to get the matched dataset
matched_data <- df[c(treated_indices, control_indices), ]

#save inegis to use
write.csv(matched_data[,c("INEGI")], "Outputs/genetic_matched_muns.csv", row.names = F)

rm(list=ls())
gc()




